import sys
from tqdm import tqdm
import json
import time

import logging
import json
import requests

import os
directory_to_add = os.getcwd()
if directory_to_add not in sys.path:
    sys.path.append(directory_to_add + "/backend")
from cognitive_kernel.base_model_connection import BaseModelConnection, ChatGPTConnection

SYSTEM_PROMPT_TEXT = """As an evaluator, you will be presented with three primary components to assist you in your role:

1. Web Task Instruction: This is a clear and specific directive provided in natural language, detailing the online activity to be carried out. These requirements may include conducting searches, verifying information, comparing prices, checking availability, or any other action relevant to the specified web service (such as Amazon, Apple, ArXiv, BBC News, Booking etc).

2. Webpages accessibility tree: This is a series of webpages' representation showing the trajectory of performing a web task. It serves as a proof of the actions taken in response to the instruction.

3. Result Response: This is a textual response obtained after the execution of the web task. It serves as textual result in response to the instruction.

-- You DO NOT NEED to interact with web pages or perform actions such as booking flights or conducting searches on websites.
-- You SHOULD NOT make assumptions based on information not presented in the accessibility tree when comparing it to the instructions.
-- Your primary responsibility is to conduct a thorough assessment of the web task instruction against the outcome depicted in the accessibility tree and in the response, evaluating whether the actions taken align with the given instructions.
-- NOTE that the instruction may involve more than one task, for example, locating the garage and summarizing the review. Failing to complete either task, such as not providing a summary, should be considered unsuccessful.
-- NOTE that the accessibility tree is authentic, but the response provided by LLM is generated at the end of web browsing, and there may be discrepancies between the response and the accessibility tree.
-- Note the difference: 1) Result response may contradict the accessibility tree, then the content of the accessibility tree prevails, 2) The content in the Result response is not mentioned on the accessibility tree, choose to believe the content.

You should elaborate on how you arrived at your final evaluation and then provide a definitive verdict on whether the task has been successfully accomplished, either as 'SUCCESS' or 'NOT SUCCESS'."""

USER_PROMPT_TEXT = """TASK: <task>
Result Response: <answer>
<num> accessibility trees at the end: """

SYSTEM_PROMPT_STEP_TEXT = """As an evaluator, you will be presented with three primary components to assist you in your role:

1. Web Task Instruction: This is a clear and specific directive provided in natural language, detailing the online activity to be carried out. These requirements may include conducting searches, verifying information, comparing prices, checking availability, or any other action relevant to the specified web service (such as Amazon, Apple, ArXiv, BBC News, Booking etc).

2. Webpage Accessibility Tree: This is a representation of webpage's elements at the latest step. The latest action is made based on the information in the tree.

3. The latest action: There is a rationale describing the reason for performing the latest action and a short string representing the latest action that was executed.

-- You DO NOT NEED to interact with web pages or perform actions such as booking flights or conducting searches on websites.
-- Your primary responsibility is to conduct a thorough assessment of the latest action based on the history actions and the latest accessibility tree, evaluating whether the latest action helps completing the task.
-- NOTE that the web accessibility tree is authentic, but the rationale and action are generated by an LLM, and there may be discrepancies between the text and the accessibility tree. If the rationale contradict the accessibility tree, then the content of the accessibility tree prevails.
-- If an action results in no change in the accessibility tree, it should be considered as not helpful.

You should elaborate on how you arrived at your final evaluation and then provide a definitive verdict on whether the latest action helps completing the task. Then, you provide suggestions in how to perform the correct action on this step. You final verdict should be one of the following: 

'GOOD': The latest action was the most intuitive action to take from previous step, and the result screenshot shows good progress.

'FAIR': The latest action was a reasonable action to try, and there might be other alternatives, and the result screenshot may or may not show some progress.

'POOR': The latest action makes little sense or is not helpful, and there is no progress in completing the task

The output format is: 

THOUGHT: [reasons on how you arrive at the final evaluation]
EVALUATION: GOOD/FAIR/POOR
FEEDBACK: [the suggestions for performing the latest action based on the current accessibility tree]
"""

SYSTEM_PROMPT_START_TEXT = """As an evaluator, you will be presented with two primary components to assist you in your role:

1. Web Task Instruction: This is a clear and specific directive provided in natural language, detailing the online activity to be carried out. These requirements may include conducting searches, verifying information, comparing prices, checking availability, or any other action relevant to the specified web service (such as Amazon, Apple, ArXiv, BBC News, Booking etc).

2. The generated action: something like `CallWeb(query=generated_query, target_url="https://www.google.com/")` or `Direct Answering`. 

`CallWeb` means a function that search on a certain website using the query. `Direct answer` means using the parametric knowledge in the language model to answer the question directly.

-- You DO NOT NEED to interact with web pages or perform actions such as booking flights or conducting searches on websites.
-- Your primary responsibility is to conduct a thorough assessment of the latest action based on the original task query. 
-- Specifically, you should evaluate based on three angles: 1) whether to use callweb or direct answering. 2) whether the generated_query is correct and help decompose the original task query to web tasks. 3) whether the target_url is correct. 

You should elaborate on how you arrived at your final evaluation and then provide a definitive verdict on whether the latest action helps completing the task. Then, you provide suggestions in how to perform the correct action on this step. You final verdict should be one of the following: 

'GOOD': The latest action was the most intuitive action to take from previous step.

'FAIR': The latest action was a reasonable action to try, and there might be other alternatives.

'POOR': The latest action makes little sense or is not helpful, and there is no progress in completing the task

The output format is: 

THOUGHT: [reasons on how you arrive at the final evaluation]
EVALUATION: GOOD/FAIR/POOR
FEEDBACK: [the suggestions for performing the latest action]
"""

USER_PROMPT_STEP_TEXT = """TASK: <task>

Accessibility Tree: <tree>

The latest action: <action>"""

SYSTEM_PROMPT_TREE_SEARCH = """As an evaluator, you will be presented with two primary components to assist you in your role:

1. Web Task Instruction: This is a clear and specific directive provided in natural language, detailing the online activity to be carried out. These requirements may include conducting searches, verifying information, comparing prices, checking availability, or any other action relevant to the specified web service (such as Amazon, Apple, ArXiv, BBC News, Booking etc).

2. The generated action: something like `CallWeb(query=generated_query, target_url="https://www.google.com/")` or `Direct Answering`. 

3. The process of simulation when the generated action is performed, together with the results.For example, when the task instruction is `Find the price of a Porche 911`, and the action is to go to google.com to search `Porche 911 price`. We provide the accessibility after the final action (e.g., a google page listing the price of a Porche 911), and the results generated by the model (the price, such as $100,000).

`CallWeb` means a function that search on a certain website using the query. `Direct answer` means using the parametric knowledge in the language model to answer the question directly.

-- You DO NOT NEED to interact with web pages or perform actions such as booking flights or conducting searches on websites.
-- Your primary responsibility is to conduct a thorough assessment of the latest action based on the original task query. 
-- Specifically, you should evaluate based on three angles: 1) whether to use callweb or direct answering. 2) whether the generated_query is correct and help decompose the original task query to web tasks. 3) whether the target_url is correct. 

You should elaborate on how you arrived at your final evaluation and then provide a definitive verdict on whether the latest action helps completing the task. Then, you provide suggestions in how to perform the correct action on this step. You final verdict should be one of the following: 

'GOOD': The latest action was the most intuitive action to take from previous step.

'FAIR': The latest action was a reasonable action to try, and there might be other alternatives.

'POOR': The latest action makes little sense or is not helpful, and there is no progress in completing the task

The output format is: 

THOUGHT: [reasons on how you arrive at the final evaluation]
EVALUATION: GOOD/FAIR/POOR
FEEDBACK: [the suggestions for performing the latest action]
"""

USER_PROMPT_TREE_SEARCH = """TASK: <task>

Current Accessibility Tree: <tree>

The latest action: <action>

The simulation result:

The final simulated accessibility tree: <final_tree>

The final simulated result: <final_result>
"""

import re
def parse_eval_output(output):
    '''
        expected output format: 

        THOUGHT: ...
        EVALUATION: ...
        FEEDBACK: ...
    '''

    pattern = r'THOUGHT:\s*(.*?)\s*(?:EVALUATION:\s*(.*?))?\s*(?:FEEDBACK:\s*(.*?))?$'
    
    # Use re.search to find matches
    match = re.search(pattern, output, re.IGNORECASE)
    
    if match:
        thought = match.group(1) if match.group(1) else None
        evaluation = match.group(2) if match.group(2) else None
        feedback = match.group(3) if match.group(3) else None
    else:
        thought, evaluation, feedback = None, None, None
    
    return {
        'THOUGHT': thought,
        'EVALUATION': evaluation,
        'FEEDBACK': feedback
    }

if __name__ == "__main__":

    import os
    from openai import OpenAI

    # api_key = os.environ.get("OPENAI_API_KEY")
    # base_url = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")

    # original_chatgpt_client = OpenAI(api_key=api_key)

    # client = OpenAI(
    #         base_url=base_url,
    #         api_key=api_key,
    #     )

    chatgpt_connection = ChatGPTConnection("gpt-4o-mini")


    #### boston case

    task = "Seven Brief Lessons on Physics was written by an Italian physicist that has worked in France since what year?"

    carlo_rovelli_case = [
        {"role": "system", 
        "content": "You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue. Here's the information you'll have: The user's objective: This is the task you're trying to complete. The current observation (web page's accessibility tree): This is a simplified representation of the webpage, providing key information. The open tabs: These are the tabs you have open. The previous actions: You can refer to the conversation history with the user to see the actions you have taken. It may be helpful to track your progress. The actions you can perform are the following: `click [id]`: This action clicks on an element with a specific id on the webpage. `type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"Enter\" key is pressed after typing unless press_enter_after is set to 0. `scroll [direction=down|up]`: Scroll the page up or down. `wait`: Wait for the page to load, with a duration of 5 seconds. `goback`: Navigate to the previously viewed page. `restart`: Navigate to the Google search homepage. When you can't find information in some websites, try starting over from Google search. `stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as \"N/A\" in the bracket. To be successful, it is very important to follow the following rules: 1. You should only issue an action that is valid given the current observation. For example, you should NOT type into buttons or click on statictext. 2. You should only issue one action at a time. 3. STRICTLY Avoid repeating the same action if the webpage remains unchanged. You may have selected the wrong web element or numerical label. Continuous use of the Wait is also NOT allowed. 4. Issue stop action when you think you have achieved the objective. Don't generate anything after stop. 5. If you ever need to login, login with Google, use cognitivekernel@gmail.com as the email and IsAwesome as the password. mafinder189@gmail.com is the recovery email, enter it when required. Try to skip any follow-up questions that may appear after logging in. Your reply should strictly follow the format: Thought: {Your brief thoughts (briefly summarize the info that will help complete the task)} Action: ```{the next action you choose to take}```"},
        {"role": "user",
        "content":"<|im_omitted|>"},
        {"role": "assistant",
        "content": "Thought: To find the answer, I need to search for the book \"Seven Brief Lessons on Physics\" and find information about the author's background. The Google search homepage is currently displayed, and I should use the search textbox to input my query. Action: ```type [6] [Seven Brief Lessons on Physics author] [1]```"},
        {"role": "user",
        "content":"<|im_omitted|>"},
        {"role":"assistant",
        "content": "Thought: The search results page shows that \"Seven Brief Lessons on Physics\" was written by Carlo Rovelli, an Italian physicist. However, the task specifically asks for the year Rovelli started working in France. This information is not directly visible in the provided observation. The Wikipedia link might contain detailed information about Rovelli's career, including when he started working in France. Action: ```click [39]```"},
        {"role": "user",
        "content":"OBSERVATION: Tab 0 (current): Google [1] RootWebArea 'Carlo Rovelli - Wikipedia' focused: true [2] StaticText 'Jump to content' [3] navigation 'Site' [4] button 'Main menu' hasPopup: menu [5] link 'Wikipedia The Free Encyclopedia' [6] link 'Search' keyshortcuts: Alt+f [7] navigation 'Personal tools' [8] link 'Donate' [9] navigation 'Appearance' [10] button 'Appearance' hasPopup: menu [11] link 'Create account' [12] link 'Log in' keyshortcuts: Alt+o [13] button 'Personal tools' hasPopup: menu [14] link 'Photograph a historic site, help Wikipedia, and win a prize. Participate in the world's largest photography competition this month! Learn more' [15] navigation 'Contents' [16] button 'Toggle the table of contents' hasPopup: menu [17] button 'Go to an article in another language. Available in 24 languages' hasPopup: menu [18] navigation 'Namespaces' [19] link 'Article' keyshortcuts: Alt+c [20] link 'Talk' keyshortcuts: Alt+t [21] navigation 'Views' [22] link 'Read' [23] link 'Edit' keyshortcuts: Alt+e [24] link 'View history' keyshortcuts: Alt+h [25] navigation 'Page tools' [26] button 'Tools' hasPopup: menu [27] StaticText 'From Wikipedia, the free encyclopedia' [28] columnheader 'Carlo Rovelli' required: false [29] cell 'Rovelli in 2017' [30] StaticText ' (born 3 May 1956) is an Italian ' [31] link 'theoretical physicist' [32] StaticText ' and writer who has worked in Italy, the United States, France, and Canada.' [33] link '[1]' [34] StaticText ' He is currently Emeritus Professor at the Centre de Physique Theorique of Marseille in France, a Distinguished Visiting Research Chair at the ' [35] link 'Perimeter Institute' [36] StaticText ',' [37] link '[2]' [38] StaticText ' core member of the Rotman Institute of Philosophy of ' [39] link 'Western University' [40] StaticText ' in Canada,' [41] link '[3]' [42] StaticText ' and Fractal Faculty of the Santa Fe Institute in The United States.' [43] link '[4]' [44] StaticText 'Rovelli works mainly in the field of ' [45] link 'quantum gravity' [46] StaticText ' and is a founder of the theory of ' [47] link 'loop quantum gravity' [48] StaticText '. He has also worked in the ' [49] link 'history' [50] StaticText ' and ' [51] link 'philosophy of science' [52] StaticText ', formulating the ' [53] link 'Relational quantum mechanics' [54] StaticText ' and the notion of ' [55] link 'Thermal Time' [56] StaticText '. He collaborates with several Italian newspapers, including the cultural supplements of the ' [57] link 'Corriere della Sera' [58] StaticText ', ' [59] link 'Il Sole 24 Ore' [60] StaticText ', and ' [61] link 'La Repubblica' [62] StaticText '. His ' [63] link 'popular science' [64] StaticText ' book, ' [65] link 'Seven Brief Lessons on Physics' [66] StaticText ', was originally published in Italian in 2014. It has sold over a million copies worldwide.' [67] link '[5]' [68] StaticText ' In 2019, he was included by ' [69] link 'Foreign Policy' [70] StaticText ' magazine in the list of the 100 most influential global thinkers.' [71] link '[6]' [72] StaticText ' In 2021, he was included by ' [73] link 'Prospect' [74] StaticText ' magazine in the list of the 50 world's top thinkers.' [75] link '[7]' OBJECTIVE: Seven Brief Lessons on Physics was written by an Italian physicist that has worked in France since what year?"},
        {"role": "assistant",
        "content": "Thought: The Wikipedia page for Carlo Rovelli mentions that he has worked in Italy, the United States, France, and Canada. It specifically states that he is currently Emeritus Professor at the Centre de Physique Theorique of Marseille in France. However, it does not provide the exact year he started working in France. Given the information available, it seems that Rovelli has had a distinguished career with appointments in multiple countries, but the specific year he began working in France is not explicitly mentioned. Action: ```stop [Carlo Rovelli has worked in France, among other countries, but the exact year he started working in France is not explicitly mentioned in the available information.]```"},
    ]

    prompt = f"""TASK: {task}

    Accessibility Tree: {carlo_rovelli_case[-2]['content']}

    The rationale of the latest action : {carlo_rovelli_case[-1]['content'].split("Action:")[0]}

    The latest action : {carlo_rovelli_case[-1]['content'].split("Action:")[1]}
    """

    eval_output = chatgpt_connection.get_response_search_test([
            {"role": "system", 
            "content": SYSTEM_PROMPT_STEP_TEXT},
            {"role": "user", 
            "content": prompt}
        ])
    print("evaluation:", eval_output)
        
    print(parse_eval_output(eval_output))


